## Package 'mclust' version 5.3
## Type 'citation("mclust")' for citing this R package in publications.
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, cbind, colMeans,
## colnames, colSums, do.call, duplicated, eval, evalq, Filter,
## Find, get, grep, grepl, intersect, is.unsorted, lapply,
## lengths, Map, mapply, match, mget, order, paste, pmax,
## pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
## rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
## tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:tidyr':
##
## expand
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:plyr':
##
## rename
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:plyr':
##
## desc
## Loading required package: XVector
##
## Attaching package: 'XVector'
## The following object is masked from 'package:plyr':
##
## compact
##
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
##
## strsplit
##
## Attaching package: 'kknn'
## The following object is masked from 'package:caret':
##
## contr.dummy
#setwd("C:/Users/Ling/Documents/My Study/STAT5003/Assignment2/predicting-novel-kinase-substrates")
# load full data set
dt.Insulin <- read.delim("datasets/InsulinPhospho.txt")
#dim(dt.Insulin)
# load partially labeled data set
dt.Akt <- read.delim("datasets/Akt_substrates.txt", header = F)
dt.mTOR <- read.delim("datasets/mTOR_substrates.txt", header = F)
#dim(dt.Akt)
#dim(dt.mTOR)
# map labled to the full data set
dt.Insulin$Kinases[dt.Insulin$Identifier %in% dt.Akt$V1] <- "Akt"
dt.Insulin$Kinases[dt.Insulin$Identifier %in% dt.mTOR$V1] <- "mTOR"
unique(dt.Insulin$Kinases)
## [1] NA "mTOR" "Akt"
# generate a subset of the labelled data
dt.labeled <- dt.Insulin[which(dt.Insulin$Kinases == 'Akt' | dt.Insulin$Kinases == 'mTOR'), ]
#setwd("C:/Users/Ling/Documents/My Study/STAT5003/Assignment2/predicting-novel-kinase-substrates")
# load prediction_2016 result
df.Akt <- read.csv(file="datasets/Akt_predictions2016.csv", header=TRUE, sep=",")
df.mTOR <- read.csv(file="datasets/mTOR_predictions2016.csv", header=TRUE, sep=",")
colnames(df.Akt)[1] <- "GeneSymbol"
colnames(df.mTOR)[1] <- "GeneSymbol"
df.Akt %>%
mutate(Identifier = paste0(str_trim(str_to_upper(df.Akt$GeneSymbol), side="both"), ";",str_trim(df.Akt$Phosphorylation.site, side="both"), ";" )) %>%
mutate(Full.model.predict = as.numeric(Full.model.predict)) -> df.Akt
names(df.Akt)[colnames(df.Akt)=="Full.model.predict"] <- "predictResult"
head(df.Akt)
## GeneSymbol Phosphorylation.site Sequence.window Uniprot.ID IPI.ID
## 1 Irs2 303 FRPRSKSQSSGSS B9EJW3 IPI00923679
## 2 Cep131 114 GRRRVASLSKASS B1AXI9 IPI00889961
## 3 Ndrg3 331 ARSRTHSTSSSIG Q9QYF9 IPI00136107
## 4 Flnc 2234 GRERLGSFGSITR Q8VHX6-1 IPI00664670
## 5 Cables1 272 RRCRTLSGSPRPK B2RWU9 IPI00929838
## 6 Rtkn 520 GRPRTFSLDAAPA Q8C6B2-1 IPI00226220
## predictResult Motif.predict Phosphoproteome.predict Delta
## 1 0.9941443 0.9620853 0.9746454 0.5092316
## 2 0.9932284 0.7725118 0.9998626 0.4755840
## 3 0.9900757 0.8578199 0.9847889 0.4960437
## 4 0.9891778 0.7630332 0.9974299 0.4717430
## 5 0.9868492 0.8293839 0.9873219 0.4549283
## 6 0.9832485 0.8104265 0.9775198 0.5168305
## Uniprot.database Identifier
## 1 http://www.uniprot.org/uniprot/B9EJW3 IRS2;303;
## 2 http://www.uniprot.org/uniprot/B1AXI9 CEP131;114;
## 3 http://www.uniprot.org/uniprot/Q9QYF9 NDRG3;331;
## 4 http://www.uniprot.org/uniprot/Q8VHX6-1 FLNC;2234;
## 5 http://www.uniprot.org/uniprot/B2RWU9 CABLES1;272;
## 6 http://www.uniprot.org/uniprot/Q8C6B2-1 RTKN;520;
df.mTOR %>%
mutate(Identifier = paste0(str_trim(str_to_upper(df.mTOR$GeneSymbol), side="both"), ";",str_trim(df.mTOR$Phosphorylation.site, side="both"), ";" )) %>%
mutate(Full.model.predict = as.numeric(Full.model.predict)) -> df.mTOR
names(df.mTOR)[colnames(df.mTOR)=="Full.model.predict"] <- "predictResult"
head(df.mTOR)
## GeneSymbol Phosphorylation.site Sequence.window Uniprot.ID IPI.ID
## 1 Ulk2 772 GRVCVGSPPGPGL Q9QY01 IPI00336256
## 2 C2cd5 295 NQTYSFSPSKSYS Q7TPS5-1 IPI00408171
## 3 Ulk2 781 GPGLGSSPPGAEG Q9QY01 IPI00336256
## 4 Stk11ip 444 LETMGSSPLSTTK Q3TAA7 IPI00467059
## 5 Wdr91 257 PASLSQSPRVGFL Q7TMQ7 IPI00339693
## 6 Oxr1 62 RPPSPASPEGPDT Q8C715 IPI00654216
## predictResult Motif.predict Phosphoproteome.predict Delta
## 1 0.9931176 0.8024691 0.9872836 0.7504053
## 2 0.9872644 0.8024691 0.9721223 0.8133507
## 3 0.9856206 0.7901235 0.9772598 0.8175276
## 4 0.9828952 0.7654321 0.9779556 0.6570651
## 5 0.9813423 0.7283951 0.9961476 0.7576563
## 6 0.9733467 0.8765432 0.9498938 0.8432717
## Uniprot.database Identifier
## 1 http://www.uniprot.org/uniprot/Q9QY01 ULK2;772;
## 2 http://www.uniprot.org/uniprot/Q7TPS5-1 C2CD5;295;
## 3 http://www.uniprot.org/uniprot/Q9QY01 ULK2;781;
## 4 http://www.uniprot.org/uniprot/Q3TAA7 STK11IP;444;
## 5 http://www.uniprot.org/uniprot/Q7TMQ7 WDR91;257;
## 6 http://www.uniprot.org/uniprot/Q8C715 OXR1;62;
head(dt.Insulin)
## Identifier Seq.Window Avg.Fold AUC X15s
## 1 100043914;88; GRHERRSSAEQHS 1.0315122 0.2967288 -0.1858515
## 2 100043914;89; RHERRSSAEQHSE 1.2160532 0.5279128 0.1077492
## 3 1110018J18RIK;18; CAGRVPSPAGSVT 0.2860675 0.5240430 0.7031150
## 4 1110037F02RIK;133; SVDRVISHDRDSP -0.4100882 0.6480379 -0.3962030
## 5 1110037F02RIK;138; ISHDRDSPPPPPP -0.3664219 0.5002719 -0.3962030
## 6 1110037F02RIK;1628; FLSEPSSPGRSKT -0.2148644 0.4370191 -0.2185709
## X30s X1m X2m X5m X10m X20m
## 1 0.5813547 1.88857193 2.1018999 1.8426771 2.287873789 1.3431367
## 2 0.2802415 1.24719923 2.0271668 2.6376657 2.374491409 1.1032924
## 3 -0.0905068 0.04348222 0.3846277 0.4637556 0.425594237 0.3769611
## 4 -0.4970602 -0.37816293 -0.6080681 -0.4343925 -0.445322740 -0.2649039
## 5 -0.2954442 -0.11986335 -0.2672015 -0.3476758 -0.283676544 -0.6626509
## 6 -0.2469075 -0.01866835 -0.3877707 -0.2848602 0.006343619 -0.1311892
## X60m Ins.1 LY Ins.2 MK Kinases
## 1 -1.60756467 0.6526217 0.34193177 2.1570012 2.13792056 <NA>
## 2 -0.04938089 0.6112755 0.04620434 3.0963462 3.22683218 <NA>
## 3 -0.01848881 -0.1988757 -0.14777344 -0.2610874 -0.02748394 <NA>
## 4 -0.25659219 0.1735112 -0.57074291 0.3012657 -0.13305247 <NA>
## 5 -0.55865953 0.2292187 -0.38138545 0.4130244 -0.13305247 <NA>
## 6 -0.43729207 0.2987756 -0.45838604 0.1086583 0.02536763 <NA>
head(dt.labeled)
## Identifier Seq.Window Avg.Fold AUC X15s
## 504 AHNAK;5536; VEAEASSPKGKFS 0.4263345 0.6567462 -0.176054689
## 640 AKT1S1;255; TQQYAKSLPVSVP 0.9899918 0.4360069 -0.007265314
## 646 AKT1S1;318; PRPRLNTSDFQKL 3.5063686 0.2030335 1.793249373
## 761 ANKRD17;2041; PVSSPSSPSPPAQ 0.6492043 0.5362401 0.009309093
## 904 APRF;727; TIDLPMSPRTLDS 0.5521512 0.5233701 -0.426227432
## 1215 AS160;324; FRSRCSSVTGVMQ 2.6444937 0.2217749 1.090046133
## X30s X1m X2m X5m X10m X20m
## 504 -0.21739065 -0.3346839 -0.4451926 0.2553367 1.046421 1.510936
## 640 0.16002791 0.5473494 1.0758795 1.5044226 1.501370 1.589304
## 646 2.94962673 3.6929821 3.6410960 4.0850147 3.898876 3.978123
## 761 -0.01560686 0.2071657 0.3408147 1.0057508 1.257341 1.197354
## 904 -0.96441078 -0.1944668 -0.2400350 1.3990019 1.448948 2.039807
## 1215 2.41608831 2.7808397 2.8150416 3.1520775 2.964196 2.874175
## X60m Ins.1 LY Ins.2 MK Kinases
## 504 1.771303 1.758602 1.2239168 2.176801 1.7279205 mTOR
## 640 1.548847 1.228480 -1.4821297 1.369718 -0.3435128 mTOR
## 646 4.011981 2.443501 0.9018806 2.615299 -0.8495963 Akt
## 761 1.191506 1.030689 0.1924467 0.384713 0.3352992 mTOR
## 904 1.354592 1.835358 1.1196218 1.994797 2.1098954 mTOR
## 1215 3.063486 2.561888 0.3589588 2.661638 -0.6009274 Akt
# remove the identifier column since it is just a phosphorilation site
dt.labeled$Identifier <- NULL
dt.labeled$Kinases <- as.factor(dt.labeled$Kinases) # only two classes so set as a factor
dt.labeled$Kinases <- as.numeric(dt.labeled$Kinases) # only two classes so set as a factor
dt.labeled$Seq.Window <- as.character(dt.labeled$Seq.Window) # convert from factor to character
head(dt.labeled)
## Seq.Window Avg.Fold AUC X15s X30s X1m
## 504 VEAEASSPKGKFS 0.4263345 0.6567462 -0.176054689 -0.21739065 -0.3346839
## 640 TQQYAKSLPVSVP 0.9899918 0.4360069 -0.007265314 0.16002791 0.5473494
## 646 PRPRLNTSDFQKL 3.5063686 0.2030335 1.793249373 2.94962673 3.6929821
## 761 PVSSPSSPSPPAQ 0.6492043 0.5362401 0.009309093 -0.01560686 0.2071657
## 904 TIDLPMSPRTLDS 0.5521512 0.5233701 -0.426227432 -0.96441078 -0.1944668
## 1215 FRSRCSSVTGVMQ 2.6444937 0.2217749 1.090046133 2.41608831 2.7808397
## X2m X5m X10m X20m X60m Ins.1 LY
## 504 -0.4451926 0.2553367 1.046421 1.510936 1.771303 1.758602 1.2239168
## 640 1.0758795 1.5044226 1.501370 1.589304 1.548847 1.228480 -1.4821297
## 646 3.6410960 4.0850147 3.898876 3.978123 4.011981 2.443501 0.9018806
## 761 0.3408147 1.0057508 1.257341 1.197354 1.191506 1.030689 0.1924467
## 904 -0.2400350 1.3990019 1.448948 2.039807 1.354592 1.835358 1.1196218
## 1215 2.8150416 3.1520775 2.964196 2.874175 3.063486 2.561888 0.3589588
## Ins.2 MK Kinases
## 504 2.176801 1.7279205 2
## 640 1.369718 -0.3435128 2
## 646 2.615299 -0.8495963 1
## 761 0.384713 0.3352992 2
## 904 1.994797 2.1098954 2
## 1215 2.661638 -0.6009274 1
str(dt.labeled)
## 'data.frame': 48 obs. of 16 variables:
## $ Seq.Window: chr "VEAEASSPKGKFS" "TQQYAKSLPVSVP" "PRPRLNTSDFQKL" "PVSSPSSPSPPAQ" ...
## $ Avg.Fold : num 0.426 0.99 3.506 0.649 0.552 ...
## $ AUC : num 0.657 0.436 0.203 0.536 0.523 ...
## $ X15s : num -0.17605 -0.00727 1.79325 0.00931 -0.42623 ...
## $ X30s : num -0.2174 0.16 2.9496 -0.0156 -0.9644 ...
## $ X1m : num -0.335 0.547 3.693 0.207 -0.194 ...
## $ X2m : num -0.445 1.076 3.641 0.341 -0.24 ...
## $ X5m : num 0.255 1.504 4.085 1.006 1.399 ...
## $ X10m : num 1.05 1.5 3.9 1.26 1.45 ...
## $ X20m : num 1.51 1.59 3.98 1.2 2.04 ...
## $ X60m : num 1.77 1.55 4.01 1.19 1.35 ...
## $ Ins.1 : num 1.76 1.23 2.44 1.03 1.84 ...
## $ LY : num 1.224 -1.482 0.902 0.192 1.12 ...
## $ Ins.2 : num 2.177 1.37 2.615 0.385 1.995 ...
## $ MK : num 1.728 -0.344 -0.85 0.335 2.11 ...
## $ Kinases : num 2 2 1 2 2 1 1 1 1 1 ...
#install.packages("lattice")
#install.packages("ggplot2")
filteredFeatures <- c("Avg.Fold","AUC","Ins.1","Ins.2","LY","MK","Kinases")
temporalFeatures <- c("X15s","X30s","X1m","X2m","X5m","X10m","X20m","X60m","Kinases")
# try plotting the temporal data
pairs(Kinases~., dt.labeled[,temporalFeatures], col=dt.labeled[,temporalFeatures]$Kinases)
# try plotting the other features
# pair-wise scatterplots colored by class
pairs(Kinases~., dt.labeled[,filteredFeatures], col=dt.labeled[,filteredFeatures]$Kinases)
print("Plots appear to indicate that the labeled data can be separated. Thinking SVM should work.")
## [1] "Plots appear to indicate that the labeled data can be separated. Thinking SVM should work."
# visualise the patterns
library(RColorBrewer)
par(mfrow=c(1, 1), mar=c(3, 3, 3, 3) + 0.1)
barplot(aktSequences.pfm, col=brewer.pal(nrow(aktSequences.pfm), "Paired"),
legend.text = rownames(aktSequences.pfm),
args.legend=list(x=ncol(aktSequences.pfm) + 5,y=max(colSums(aktSequences.pfm))+4,bty = "n"),
main="Akt Sequence Window patterns")
par(mfrow=c(1, 1), mar=c(3, 3, 3, 3) + 0.1)
barplot(mTORSequences.pfm, col=brewer.pal(nrow(mTORSequences.pfm), "Paired"),
legend.text = rownames(mTORSequences.pfm),
args.legend=list(x=ncol(mTORSequences.pfm)+5,y=max(colSums(mTORSequences.pfm))+4,bty = "n"),
main="mTOR Sequence Window patterns")
# Pattern match score
# * Basic Formula
# + With the probabbility matrix (PSSM) it is possible to calculate a total score for a specific sequence
# + The higher the score is, the higher is the probability that the sequence contains the searched motif.
# + Convert the sequence window in the unlabelled dataset to a sum based on the Position-specific Scoring Matrix (PSSM)
library(seqLogo)
## Loading required package: grid
aktSequences.pfm <- consensusMatrix(phosphoSites.Akt.allSequences, as.prob = T)
colnames(aktSequences.pfm) <- colnames(aktSequences)
mTORSequences.pfm <- consensusMatrix(phosphoSites.mTOR.allSequences, as.prob = T)
colnames(mTORSequences.pfm) <- colnames(mTORSequences)
# motif calculations function
getMatchScore <- function(seq, PSSM) {
x <- strsplit(x=seq,split='')
#x
#initialise vector to keep scores
seq_score <- vector()
#get the corresponding values from the PSSM
for (i in 1:nchar(seq)){
if (x[[1]][i] != "_") {
seq_score[i] <- PSSM[x[[1]][i],i]
}
else seq_score[i] = 0.00
}
#seq_score
sum(seq_score)
#max score
#sum(apply(mm,2,max))
}
getPercentageMatch <- function (score, max) {
# normalise score to 0-1
score / max
}
dt.Insulin$AktMotif <- as.numeric(lapply(dt.Insulin$Seq.Window, getMatchScore, PSSM=aktSequences.pfm))
dt.Insulin$mTORMotif <- as.numeric(lapply(dt.Insulin$Seq.Window, getMatchScore, PSSM=mTORSequences.pfm))
maxMotifAkt <- max(dt.Insulin$AktMotif)
minMotifAkt <- min(dt.Insulin$AktMotif)
maxMotifmTOR <- max(dt.Insulin$mTORMotif)
minMotifmTOR <- min(dt.Insulin$mTORMotif)
# normalised score
dt.Insulin$aktMotifMatch <- as.numeric(lapply(dt.Insulin$AktMotif, getPercentageMatch, max=maxMotifAkt))
dt.Insulin$mTORMotifMatch <- as.numeric(lapply(dt.Insulin$mTORMotif, getPercentageMatch, max=maxMotifmTOR))
#View(dt.unlabeled)
head(dt.Insulin)
## Identifier Seq.Window Avg.Fold AUC X15s
## 1 100043914;88; GRHERRSSAEQHS 1.0315122 0.2967288 -0.1858515
## 2 100043914;89; RHERRSSAEQHSE 1.2160532 0.5279128 0.1077492
## 3 1110018J18RIK;18; CAGRVPSPAGSVT 0.2860675 0.5240430 0.7031150
## 4 1110037F02RIK;133; SVDRVISHDRDSP -0.4100882 0.6480379 -0.3962030
## 5 1110037F02RIK;138; ISHDRDSPPPPPP -0.3664219 0.5002719 -0.3962030
## 6 1110037F02RIK;1628; FLSEPSSPGRSKT -0.2148644 0.4370191 -0.2185709
## X30s X1m X2m X5m X10m X20m
## 1 0.5813547 1.88857193 2.1018999 1.8426771 2.287873789 1.3431367
## 2 0.2802415 1.24719923 2.0271668 2.6376657 2.374491409 1.1032924
## 3 -0.0905068 0.04348222 0.3846277 0.4637556 0.425594237 0.3769611
## 4 -0.4970602 -0.37816293 -0.6080681 -0.4343925 -0.445322740 -0.2649039
## 5 -0.2954442 -0.11986335 -0.2672015 -0.3476758 -0.283676544 -0.6626509
## 6 -0.2469075 -0.01866835 -0.3877707 -0.2848602 0.006343619 -0.1311892
## X60m Ins.1 LY Ins.2 MK Kinases
## 1 -1.60756467 0.6526217 0.34193177 2.1570012 2.13792056 <NA>
## 2 -0.04938089 0.6112755 0.04620434 3.0963462 3.22683218 <NA>
## 3 -0.01848881 -0.1988757 -0.14777344 -0.2610874 -0.02748394 <NA>
## 4 -0.25659219 0.1735112 -0.57074291 0.3012657 -0.13305247 <NA>
## 5 -0.55865953 0.2292187 -0.38138545 0.4130244 -0.13305247 <NA>
## 6 -0.43729207 0.2987756 -0.45838604 0.1086583 0.02536763 <NA>
## AktMotif mTORMotif aktMotifMatch mTORMotifMatch
## 1 2.636364 1.192308 0.5321101 0.3522727
## 2 2.545455 1.576923 0.5137615 0.4659091
## 3 2.590909 2.384615 0.5229358 0.7045455
## 4 2.272727 1.615385 0.4587156 0.4772727
## 5 1.363636 2.346154 0.2752294 0.6931818
## 6 2.045455 2.769231 0.4128440 0.8181818
unique(dt.Insulin$Kinases)
## [1] NA "mTOR" "Akt"
max(dt.Insulin$AktMotif)
## [1] 4.954545
min(dt.Insulin$AktMotif)
## [1] 0.04545455
max(dt.Insulin$mTORMotif)
## [1] 3.384615
min(dt.Insulin$mTORMotif)
## [1] 0.1153846
dt.labeled.mTOR <- dt.Insulin[which(dt.Insulin$Kinases == 'mTOR'), ]
dt.labeled.Akt <- dt.Insulin[which(dt.Insulin$Kinases == 'Akt'), ]
# this part reference PengYi's code - KSP-PUEL GUI feature extraction ....
str(dt.Insulin)
## 'data.frame': 12062 obs. of 21 variables:
## $ Identifier : Factor w/ 12062 levels "100043914;88;",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Seq.Window : chr "GRHERRSSAEQHS" "RHERRSSAEQHSE" "CAGRVPSPAGSVT" "SVDRVISHDRDSP" ...
## $ Avg.Fold : num 1.032 1.216 0.286 -0.41 -0.366 ...
## $ AUC : num 0.297 0.528 0.524 0.648 0.5 ...
## $ X15s : num -0.186 0.108 0.703 -0.396 -0.396 ...
## $ X30s : num 0.5814 0.2802 -0.0905 -0.4971 -0.2954 ...
## $ X1m : num 1.8886 1.2472 0.0435 -0.3782 -0.1199 ...
## $ X2m : num 2.102 2.027 0.385 -0.608 -0.267 ...
## $ X5m : num 1.843 2.638 0.464 -0.434 -0.348 ...
## $ X10m : num 2.288 2.374 0.426 -0.445 -0.284 ...
## $ X20m : num 1.343 1.103 0.377 -0.265 -0.663 ...
## $ X60m : num -1.6076 -0.0494 -0.0185 -0.2566 -0.5587 ...
## $ Ins.1 : num 0.653 0.611 -0.199 0.174 0.229 ...
## $ LY : num 0.3419 0.0462 -0.1478 -0.5707 -0.3814 ...
## $ Ins.2 : num 2.157 3.096 -0.261 0.301 0.413 ...
## $ MK : num 2.1379 3.2268 -0.0275 -0.1331 -0.1331 ...
## $ Kinases : chr NA NA NA NA ...
## $ AktMotif : num 2.64 2.55 2.59 2.27 1.36 ...
## $ mTORMotif : num 1.19 1.58 2.38 1.62 2.35 ...
## $ aktMotifMatch : num 0.532 0.514 0.523 0.459 0.275 ...
## $ mTORMotifMatch: num 0.352 0.466 0.705 0.477 0.693 ...
dt.Insulin.feat <- dt.Insulin[, c("Identifier", "X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m")]
dt.Insulin.feat$X0s <- 0
head(dt.Insulin.feat)
## Identifier X15s X30s X1m X2m
## 1 100043914;88; -0.1858515 0.5813547 1.88857193 2.1018999
## 2 100043914;89; 0.1077492 0.2802415 1.24719923 2.0271668
## 3 1110018J18RIK;18; 0.7031150 -0.0905068 0.04348222 0.3846277
## 4 1110037F02RIK;133; -0.3962030 -0.4970602 -0.37816293 -0.6080681
## 5 1110037F02RIK;138; -0.3962030 -0.2954442 -0.11986335 -0.2672015
## 6 1110037F02RIK;1628; -0.2185709 -0.2469075 -0.01866835 -0.3877707
## X5m X10m X20m X60m X0s
## 1 1.8426771 2.287873789 1.3431367 -1.60756467 0
## 2 2.6376657 2.374491409 1.1032924 -0.04938089 0
## 3 0.4637556 0.425594237 0.3769611 -0.01848881 0
## 4 -0.4343925 -0.445322740 -0.2649039 -0.25659219 0
## 5 -0.3476758 -0.283676544 -0.6626509 -0.55865953 0
## 6 -0.2848602 0.006343619 -0.1311892 -0.43729207 0
#### secondary feature 1
# magnitude calculation (mathematical mean)
average.score <- rowSums(dt.Insulin.feat[, 2:9]) / ncol(dt.Insulin.feat[, 2:9])
names(average.score) <- rownames(dt.Insulin.feat)
head(average.score)
## 1 2 3 4 5 6
## 1.0315122 1.2160532 0.2860675 -0.4100882 -0.3664219 -0.2148644
#### secondary feature 2
# temporal profile fitting to check if the profile follows are good trend
fitting.score <- c()
for (i in 1:nrow(dt.Insulin)) {
y <- as.numeric(dt.Insulin[i, 2:10]);
x <- 2:10
x2 = x^2
lmfit <- lm(formula = y ~ x + x2 - 1)
f.stat <- summary(lmfit)$fstatistic
fitting.score <- c(fitting.score, f.stat[1])
}
fitted.score <- log2(fitting.score)
names(fitted.score) <- rownames(dt.Insulin.feat)
# combine extracted secondary features with primary features
dt.Insulin.feat <- cbind(fitted.score, dt.Insulin.feat) # removed avg score since same as avg fold
head(dt.Insulin.feat)
## fitted.score Identifier X15s X30s X1m
## 1 4.4639660 100043914;88; -0.1858515 0.5813547 1.88857193
## 2 4.6421890 100043914;89; 0.1077492 0.2802415 1.24719923
## 3 2.3391666 1110018J18RIK;18; 0.7031150 -0.0905068 0.04348222
## 4 1.7749557 1110037F02RIK;133; -0.3962030 -0.4970602 -0.37816293
## 5 1.0122698 1110037F02RIK;138; -0.3962030 -0.2954442 -0.11986335
## 6 -0.1060349 1110037F02RIK;1628; -0.2185709 -0.2469075 -0.01866835
## X2m X5m X10m X20m X60m X0s
## 1 2.1018999 1.8426771 2.287873789 1.3431367 -1.60756467 0
## 2 2.0271668 2.6376657 2.374491409 1.1032924 -0.04938089 0
## 3 0.3846277 0.4637556 0.425594237 0.3769611 -0.01848881 0
## 4 -0.6080681 -0.4343925 -0.445322740 -0.2649039 -0.25659219 0
## 5 -0.2672015 -0.3476758 -0.283676544 -0.6626509 -0.55865953 0
## 6 -0.3877707 -0.2848602 0.006343619 -0.1311892 -0.43729207 0
dt.Insulin <- merge(dt.Insulin, dt.Insulin.feat, by=c("Identifier", "X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m"))
dt.Insulin$X0s <- NULL
head(dt.Insulin)
## Identifier X15s X30s X1m X2m
## 1 100043914;88; -0.1858515 0.5813547 1.88857193 2.1018999
## 2 100043914;89; 0.1077492 0.2802415 1.24719923 2.0271668
## 3 1110018J18RIK;18; 0.7031150 -0.0905068 0.04348222 0.3846277
## 4 1110037F02RIK;133; -0.3962030 -0.4970602 -0.37816293 -0.6080681
## 5 1110037F02RIK;138; -0.3962030 -0.2954442 -0.11986335 -0.2672015
## 6 1110037F02RIK;1628; -0.2185709 -0.2469075 -0.01866835 -0.3877707
## X5m X10m X20m X60m Seq.Window Avg.Fold
## 1 1.8426771 2.287873789 1.3431367 -1.60756467 GRHERRSSAEQHS 1.0315122
## 2 2.6376657 2.374491409 1.1032924 -0.04938089 RHERRSSAEQHSE 1.2160532
## 3 0.4637556 0.425594237 0.3769611 -0.01848881 CAGRVPSPAGSVT 0.2860675
## 4 -0.4343925 -0.445322740 -0.2649039 -0.25659219 SVDRVISHDRDSP -0.4100882
## 5 -0.3476758 -0.283676544 -0.6626509 -0.55865953 ISHDRDSPPPPPP -0.3664219
## 6 -0.2848602 0.006343619 -0.1311892 -0.43729207 FLSEPSSPGRSKT -0.2148644
## AUC Ins.1 LY Ins.2 MK Kinases AktMotif
## 1 0.2967288 0.6526217 0.34193177 2.1570012 2.13792056 <NA> 2.636364
## 2 0.5279128 0.6112755 0.04620434 3.0963462 3.22683218 <NA> 2.545455
## 3 0.5240430 -0.1988757 -0.14777344 -0.2610874 -0.02748394 <NA> 2.590909
## 4 0.6480379 0.1735112 -0.57074291 0.3012657 -0.13305247 <NA> 2.272727
## 5 0.5002719 0.2292187 -0.38138545 0.4130244 -0.13305247 <NA> 1.363636
## 6 0.4370191 0.2987756 -0.45838604 0.1086583 0.02536763 <NA> 2.045455
## mTORMotif aktMotifMatch mTORMotifMatch fitted.score
## 1 1.192308 0.5321101 0.3522727 4.4639660
## 2 1.576923 0.5137615 0.4659091 4.6421890
## 3 2.384615 0.5229358 0.7045455 2.3391666
## 4 1.615385 0.4587156 0.4772727 1.7749557
## 5 2.346154 0.2752294 0.6931818 1.0122698
## 6 2.769231 0.4128440 0.8181818 -0.1060349
# average fold and average score are the same. already provided
# total 16 features approx
predictorsAkt <- c("X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "fitted.score","aktMotifMatch","Kinases")
predictorsmTOR <- c("X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "fitted.score","mTORMotifMatch","Kinases")
dt.labeled.mTOR <- dt.Insulin[which(dt.Insulin$Kinases == 'mTOR'), ]
dt.labeled.Akt <- dt.Insulin[which(dt.Insulin$Kinases == 'Akt'), ]
dt.unlabelled <- dt.Insulin[which(is.na(dt.Insulin$Kinases)), ]
dt.feature.selection.akt <- dt.labeled.Akt[,predictorsAkt]
dt.feature.selection.akt$Kinases <- 1
dt.feature.selection.mTOR <- dt.labeled.mTOR[,predictorsmTOR]
dt.feature.selection.mTOR$Kinases <- 1
dim(dt.feature.selection.akt)
## [1] 22 17
dim(dt.feature.selection.mTOR)
## [1] 26 17
# pick 22 random for akt negatives
set.seed(55)
aktNeg <- sample(x=1:nrow(dt.unlabelled), size=nrow(dt.feature.selection.akt))
temp <- dt.unlabelled[aktNeg, predictorsAkt]
temp$Kinases <- -1
dt.feature.selection.akt <- rbind(dt.feature.selection.akt, temp)
temp <- NULL
# pick 26 random mtor negatives
set.seed(65)
mtorNeg <- sample(x=1:nrow(dt.unlabelled), size=nrow(dt.feature.selection.mTOR))
temp <- dt.unlabelled[mtorNeg, predictorsmTOR]
temp$Kinases <- -1
dt.feature.selection.mTOR <- rbind(dt.feature.selection.mTOR, temp)
temp <- NULL
# final sets for feature selection exercise
dt.feature.selection.akt$Kinases <- as.factor(dt.feature.selection.akt$Kinases)
dt.feature.selection.mTOR$Kinases <- as.factor(dt.feature.selection.mTOR$Kinases)
colnames(dt.feature.selection.akt)[17] <- "Class"
colnames(dt.feature.selection.mTOR)[17] <- "Class"
dim(dt.feature.selection.akt)
## [1] 44 17
dim(dt.feature.selection.mTOR)
## [1] 52 17
# function for model evaluation
sampleData <- function (positives, featureSpace) {
colnames(positives)[17] <- "Kinases" # change col name
featureSpace <- colnames(positives)
#set.seed(33)
randomsamples <- sample(nrow(dt.unlabelled), size=nrow(positives), replace = FALSE)
#print(randomsamples)
#print(randomsamples)
negatives <- dt.unlabelled[randomsamples, featureSpace]
negatives$Kinases <- -1
# make a balanced set
balanced <- rbind(positives, negatives)
colnames(balanced)[17] <- "Class" # change col name
return(balanced)
}
AnalyseModels <- function (method, data) {
# prepare training scheme utilising LOOCV
control <- trainControl(method="repeatedcv", number=10, repeats=3)
controlLOOCV <- trainControl(method="LOOCV")
model <- NULL
#set.seed(5)
switch(method,
kknn={
model <- train(Class~., data=data, method=method, trControl=controlLOOCV)
},
svmRadial={
tuneGrid <- expand.grid(sigma=seq(0.1,1,0.1),C=seq(0.1,5,0.5))
model <- train(Class~., data=data, method=method, trControl=controlLOOCV,tuneGrid=tuneGrid)
},
svmLinear={
tuneGrid <- expand.grid(C=seq(0.1,5,0.5))
model <- train(Class~., data=data, method=method, trControl=controlLOOCV, tuneGrid=tuneGrid)
},
svmPoly={
tuneGrid <- expand.grid(degree=(1:5),scale=seq(0.1,1,0.1),C=seq(0.1,5,0.5))
model <- train(Class~., data=data, method=method, trControl=controlLOOCV, tuneGrid=tuneGrid)
},
{
model <- train(Class~., data=data, method=method, trControl=controlLOOCV)
}
)
#model <- train(Class~., data=data, method=method, trControl=controlLOOCV)
print(model$bestTune)
# resample data for testing and keep the same positives
train <- sampleData(positives=data[which(data$Class == 1),])
# predict on training set
pred <- predict(model$finalModel, train[,-17])
if (method=="lda") {
rocModel <- roc(train[, 17], as.numeric(pred$class))
}
else {
rocModel <- roc(train[, 17], as.numeric(pred))
}
return(rocModel)
}
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:IRanges':
##
## cov, var
## The following objects are masked from 'package:S4Vectors':
##
## cov, var
## The following object is masked from 'package:BiocGenerics':
##
## var
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
#model <- train(Class~., data=dt.feature.selection.akt, method="svmRadial", trControl=controlLOOCV)
# analyze each model
rocKnn <- AnalyseModels("kknn",dt.feature.selection.akt)
## kmax distance kernel
## 3 9 2 optimal
rocSvmR <- AnalyseModels("svmRadial",dt.feature.selection.akt)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:Biostrings':
##
## type
## The following object is masked from 'package:scales':
##
## alpha
## The following object is masked from 'package:ggplot2':
##
## alpha
## sigma C
## 93 1 1.1
rocSvmL <- AnalyseModels("svmLinear",dt.feature.selection.akt)
## C
## 2 0.6
rocSvmP <- AnalyseModels("svmPoly",dt.feature.selection.akt)
## degree scale C
## 22 1 0.3 0.6
rocLda <- AnalyseModels("lda",dt.feature.selection.akt)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## parameter
## 1 none
#modelGbm <- AnalyseModels("gbm",dt.feature.selection.akt)
plot(rocKnn, legacy.axes = TRUE, col="red", lty=3, main="Akt model performances")
lines(rocSvmR, col="blue", lty=2)
lines(rocSvmL, col="green3", lty=1)
lines(rocSvmP, col="orange", lty=2)
lines(rocLda, col="purple", lty=3)
legend("bottomright", inset = .05, legend=c("KNN", "SVM Radial", "SVM Linear", "SVM Poly", "LDA"), col = c("red", "blue", "green3", "orange","purple"), lty=c(3, 2, 1, 2, 3))
# analyze each model
rocKnn <- AnalyseModels("kknn",dt.feature.selection.mTOR)
## kmax distance kernel
## 3 9 2 optimal
rocSvmR <- AnalyseModels("svmRadial",dt.feature.selection.mTOR)
## sigma C
## 93 1 1.1
rocSvmL <- AnalyseModels("svmLinear",dt.feature.selection.mTOR)
## C
## 1 0.1
rocSvmP <- AnalyseModels("svmPoly",dt.feature.selection.mTOR)
## degree scale C
## 12 1 0.2 0.6
rocLda <- AnalyseModels("lda",dt.feature.selection.mTOR)
## parameter
## 1 none
#modelGbm <- AnalyseModels("gbm",dt.feature.selection.akt)
plot(rocKnn, legacy.axes = TRUE, col="red", lty=3, main="mTOR model performances")
lines(rocSvmR, col="blue", lty=2)
lines(rocSvmL, col="green3", lty=1)
lines(rocSvmP, col="orange", lty=2)
lines(rocLda, col="purple", lty=3)
legend("bottomright", inset = .05, legend=c("KNN", "SVM Radial", "SVM Linear", "SVM Poly", "LDA"), col = c("red", "blue", "green3", "orange","purple"), lty=c(3, 2, 1, 2, 3))
print("SVM Radial yielded a the best overall result although the ROC curve indicates otherwise on the small subset")
## [1] "SVM Radial yielded a the best overall result although the ROC curve indicates otherwise on the small subset"
print("Best Model tuning parameters - AKT")
## [1] "Best Model tuning parameters - AKT"
print("SVM Radial - Sigma: 1, C: 1.1")
## [1] "SVM Radial - Sigma: 1, C: 1.1"
print("Best Model tuning parameters - mTOR")
## [1] "Best Model tuning parameters - mTOR"
print("SVM Radial - Sigma: 1, C: 1.1")
## [1] "SVM Radial - Sigma: 1, C: 1.1"
library(mlbench)
library(caret)
# total 16 features approx
predictorsAkt <- c("X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "fitted.score","aktMotifMatch","Kinases")
predictorsmTOR <- c("X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "fitted.score","mTORMotifMatch","Kinases")
dt.labeled.mTOR <- dt.Insulin[which(dt.Insulin$Kinases == 'mTOR'), ]
dt.labeled.Akt <- dt.Insulin[which(dt.Insulin$Kinases == 'Akt'), ]
dt.unlabelled <- dt.Insulin[which(is.na(dt.Insulin$Kinases)), ]
dt.feature.selection.akt <- dt.labeled.Akt[,predictorsAkt]
dt.feature.selection.akt$Kinases <- 1
dt.feature.selection.mTOR <- dt.labeled.mTOR[,predictorsmTOR]
dt.feature.selection.mTOR$Kinases <- 1
dim(dt.feature.selection.akt)
## [1] 22 17
dim(dt.feature.selection.mTOR)
## [1] 26 17
# pick 22 random for akt negatives
aktNeg <- sample(x=1:nrow(dt.unlabelled), size=nrow(dt.feature.selection.akt))
temp <- dt.unlabelled[aktNeg, predictorsAkt]
temp$Kinases <- -1
dt.feature.selection.akt <- rbind(dt.feature.selection.akt, temp)
temp <- NULL
# pick 26 random mtor negatives
mtorNeg <- sample(x=1:nrow(dt.unlabelled), size=nrow(dt.feature.selection.mTOR))
temp <- dt.unlabelled[mtorNeg, predictorsmTOR]
temp$Kinases <- -1
dt.feature.selection.mTOR <- rbind(dt.feature.selection.mTOR, temp)
temp <- NULL
# final sets for feature selection exercise
dim(dt.feature.selection.akt)
## [1] 44 17
dim(dt.feature.selection.mTOR)
## [1] 52 17
tstatSort <- function (train) {
train.byClass <- split(train[,-17], train$Kinases)
# perform a t-test
feature.pvalues <- c()
for(i in 1:(ncol(train)-1)) {
feature.pvalues <- c(feature.pvalues, t.test(train.byClass[[1]][,i], train.byClass[[2]][,i])$p.value)
}
names(feature.pvalues) <- colnames(train[,-17])
# filter the top most discriminative feature based on p-values
filtered.features <- names(sort(feature.pvalues)) #[1:10])
filtered.features
}
backwardStepwise <- function(train, cls.train, sortedFeatures) {
test.acc <- c()
# carry out a LOOCV
set.seed(99)
fold <- createFolds(cls.train, k=nrow(train))
# remove the least useful predictor one at a time
for (i in length(sortedFeatures):1) {
current.f <- colnames(train)[i]
features <- sortedFeatures[1:i]
#print(features)
test.accuracies <- c()
for(i in 1:length(fold)) {
truth <- cls.train[fold[[i]]]
trainingData <- train[-fold[[i]], features]
testingData <- train[fold[[i]],features]
#print(truth)
svm.model <- svm(x=trainingData, y=cls.train[-fold[[i]]], kernel="radial", type="C-classification")
pred <- predict(svm.model, testingData)
#print(pred)
accuracy <- sum(pred == truth) / length(truth)
test.accuracies <- c(test.accuracies, accuracy)
}
# take the avg test accuracy as benchmark
test.acc <- c(test.acc, mean(test.accuracies))
}
return(test.acc)
}
# run the random sampling XX times to see if the result is different
randomAccuracies <- function(positives, featureSpace, sortedFeatures, iterations) {
accuracies.all <- data.frame()
#names(accuracies.all) <- seq(length(sortedFeatures), 1)
for (i in 1:iterations) {
# pick 22 random for negatives
set.seed(i)
randomsamples <- sample(nrow(dt.unlabelled), size=nrow(positives), replace = FALSE)
#print(randomsamples)
negatives <- dt.unlabelled[randomsamples, featureSpace]
negatives$Kinases <- -1
# make a balanced set
balanced <- rbind(positives, negatives)
# run the wrapper selection process
accuracies <- backwardStepwise(train = balanced[,-17],
cls.train = balanced[,17],
sortedFeatures = sortedFeatures)
#accuracies <- data.frame(accuracies)
#names(accuracies) <- seq(length(sortedFeatures), 1)
accuracies.all <- rbind(accuracies.all, accuracies)
}
names(accuracies.all) <- seq(length(sortedFeatures), 1)
return(accuracies.all)
}
# carry out t-test to sort features
featuresAkt <- tstatSort(dt.feature.selection.akt)
# sorted features
featuresAkt
## [1] "aktMotifMatch" "Avg.Fold" "X2m" "X10m"
## [5] "X5m" "X1m" "X30s" "X60m"
## [9] "X20m" "AUC" "Ins.1" "fitted.score"
## [13] "Ins.2" "LY" "X15s" "MK"
# best feature as per t-test
featuresAkt[1]
## [1] "aktMotifMatch"
# carry out t-test to sort features
featuresmTOR <- tstatSort(dt.feature.selection.mTOR)
# sorted features
featuresmTOR
## [1] "mTORMotifMatch" "X20m" "Ins.2" "X10m"
## [5] "X60m" "Ins.1" "Avg.Fold" "X5m"
## [9] "MK" "X2m" "LY" "AUC"
## [13] "fitted.score" "X1m" "X15s" "X30s"
# best feature as per t-test
featuresmTOR[1]
## [1] "mTORMotifMatch"
# reset the positives
dt.feature.selection.akt <- dt.labeled.Akt[,predictorsAkt]
dt.feature.selection.akt$Kinases <- 1
dt.feature.selection.mTOR <- dt.labeled.mTOR[,predictorsmTOR]
dt.feature.selection.mTOR$Kinases <- 1
test.accuracies.akt <- randomAccuracies(positives = dt.feature.selection.akt,
featureSpace = predictorsAkt,
sortedFeatures = featuresAkt,
iterations = 100)
test.accuracies.mtor <- randomAccuracies(positives = dt.feature.selection.mTOR,
featureSpace = predictorsmTOR,
sortedFeatures = featuresmTOR,
iterations = 100)
# avg the results
test.accuracies.akt.avg <- colMeans(test.accuracies.akt)
test.accuracies.mtor.avg <- colMeans(test.accuracies.mtor)
plot(rev(test.accuracies.akt.avg),type="l",ylim=c(0.8, 1), xaxt="n", ylab="Test accuracies", xlab="Number of features sorted",main="Feature selection vs model accuracy akt")
axis(1, at=seq(1, 16, by = 1), las=2)
plot(rev(test.accuracies.mtor.avg),type="l",ylim=c(0.5, 1), xaxt="n", ylab="Test accuracies", xlab="Number of features sorted",main="Feature selection vs model accuracy mtor")
axis(1, at=seq(1, 16, by = 1), las=2)
## feature selection results
All the features in both Akt & Mtor classifications yields impressive results. As such all features are important and cannot be dropped.
The purpose here is to identify highly probable negatives for mTORs and Akts respectively so that an ensemble can be built more effectively.
Successful identificaiton of the negative samples will assist in removing bias and also solve the class imbalance problem.
library(MASS)
# helpers for ensemble/bagging
# function to build model
model.func <- function (dt.model.full, dt.labeled, cost.n, degree.n, iterations, dt.pred) {
# instantiate the last probability values
lastProbabilities <- rep(0, nrow(dt.model.full))
iterationCors <- c()
dt.model.full %>%
mutate(Class = -1) %>%
mutate(Class = replace(Class, Identifier %in% dt.labeled$V1, 1)) -> dt.model.full
# data frame to store pred result
dt.result <- dt.model.full[, c("Identifier", "Class")]
for (i in 1:iterations) {
dt.nolabel.sample <- dt.model.full[dt.model.full$Class==-1,][sample(nrow(dt.model.full[dt.model.full$Class==-1, ]), nrow(dt.labeled)),]
dt.model <- rbind(dt.model.full[dt.model.full$Class==1,], dt.nolabel.sample)
# build SVM classification model incl probability
fit.model <- svm(Class ~ ., data=dt.model[, 2:ncol(dt.model)], kernel="radial", type="C-classification", cost=cost.n, degree=degree.n, decision.values = TRUE, probability=TRUE)
pred <- predict(fit.model, dt.pred, decision.values = TRUE, probability = TRUE)
currentProbabilities <- attr(pred, "probabilities")[,1] # attr(pred, "probabilities")[,1]
dt.result[, ncol(dt.result) + 1] <- currentProbabilities
names(dt.result)[ncol(dt.result)] <- paste0("model_", i)
# calculate the correlations and store
currentCor <- cor(currentProbabilities, lastProbabilities)
iterationCors <- c(iterationCors, currentCor)
lastProbabilities <- currentProbabilities
}
dt.final <- data.frame(dt.result[,1:2],pred.Means=rowMeans(dt.result[,3:ncol(dt.result)]))
names(dt.final)[colnames(dt.final)=="pred.Means"] <- "predictResult"
# plot the result of the iterations
plot(iterationCors,type="l",#ylim=c(0, 2),
ylab="correlation",
xlab="Number of iterations",
main="Optimisation point for random sampling")
return(dt.final)
}
# ensemble function (make final predictions)
model.adp2.func <- function (dt.model.full, dt.labeled, dt.neg, cost.n, degree.n, dt.pred) {
dt.model.full %>%
mutate(Class = -1) %>%
mutate(Class = replace(Class, Identifier %in% dt.labeled$V1, 1)) -> dt.model.full
dt.neg$Class = -1
# data frame to store pred result
dt.result <- dt.model.full[, c("Identifier", "Class")]
for (i in 1:1000) {
# generate data for model, combining positive labeled, and the same size selected from most likely negative from adaptive sampling result
dt.model <- rbind(dt.model.full[dt.model.full$Class==1,], dt.neg[sample(nrow(dt.neg), nrow(dt.labeled)),])
# build SVM classification model incl probability
fit.model <- svm(Class ~ ., data=dt.model[, 2:ncol(dt.model)], kernel ="radial", type="C-classification", decision.values = TRUE, probability=TRUE, cost = cost.n, degree = degree.n)
pred <- predict(fit.model, dt.pred, decision.values = TRUE, probability = TRUE)
dt.result[, ncol(dt.result) + 1] <- attr(pred, "probabilities")[,1]
names(dt.result)[ncol(dt.result)] <- paste0("model_", i)
}
dt.final <- data.frame(dt.result[,1:2],pred.Means=rowMeans(dt.result[,3:ncol(dt.result)]))
names(dt.final)[colnames(dt.final)=="pred.Means"] <- "predictResult"
return(dt.final)
}
diff.func <- function (type, dt.original, dt.prediction, title) {
# bootstrap sample statistic (difference in prediction probability)
B=1000
dt.original.B <- rep(0,B)
dt.prediction.B <- rep(0,B)
for ( i in 1:B ) {
dt.original.B[i] = mean(sample(dt.original[, grep(type, colnames(dt.original))], size = nrow(dt.original), replace = TRUE), na.rm = TRUE)
dt.prediction.B[i] = mean(sample(dt.prediction[, grep(type, colnames(dt.prediction))], size = nrow(dt.prediction), replace = TRUE), na.rm = TRUE)
}
b.diff<-(dt.original.B-dt.prediction.B)*100
print(paste("95% Confidence Interval of Average(Bootstrap) difference in ", type, round(quantile(b.diff,0.05),2), round(quantile(b.diff,0.95),2)))
dt.delta <- merge(dt.original[, c(which(colnames(dt.original)=="Identifier"), grep(type, colnames(dt.original)))], dt.prediction[, c(which(colnames(dt.prediction)=="Identifier"), grep(type, colnames(dt.prediction)))], by=("Identifier"))
dt.delta$delta<-(dt.delta[,2]-dt.delta[,3])*100
#dt.delta<-dt.delta[complete.cases(dt.delta$delta),]
ggplot(dat=dt.delta) + geom_histogram(aes(x=delta),bins=50) +ggtitle(paste0(title, " Distribution of prediction difference 2016 minus 2017"))
}
dt.Akt.fulmodel <- dt.Insulin[, c("Identifier", "X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "AktMotif", "fitted.score")]
aktNancies <- model.func(dt.Akt.fulmodel, dt.Akt, 1.1, 1, 1000, dt.Akt.fulmodel[, 2:17]) # as per best tuning param val
## Warning in cor(currentProbabilities, lastProbabilities): the standard
## deviation is zero
dt.mTOR.fulmodel <- dt.Insulin[, c("Identifier", "X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "mTORMotif", "fitted.score")]
mTORNancies <- model.func(dt.mTOR.fulmodel, dt.mTOR, 1.1, 1, 1000, dt.mTOR.fulmodel[, 2:17])
## Warning in cor(currentProbabilities, lastProbabilities): the standard
## deviation is zero
# akts
aktNancies[aktNancies$Class==1, ]
## Identifier Class predictResult
## 646 AKT1S1;318; 1 0.9542897
## 1215 AS160;324; 1 0.9689396
## 1219 AS160;595; 1 0.9495719
## 1226 AS160;649; 1 0.9527689
## 1469 BAD;136; 1 0.9449047
## 1913 BRF1;92; 1 0.9563046
## 2101 CARHSP1;53; 1 0.8288359
## 3760 ECNOS;1176; 1 0.9526236
## 3764 EDC3;161; 1 0.8909417
## 4632 FKHR2;252; 1 0.9731397
## 4705 FOXO1;316; 1 0.9799596
## 5155 GSK3A;21; 1 0.9786124
## 5159 GSK3B;9; 1 0.9718666
## 5751 IRS1;265; 1 0.9759471
## 5774 IRS1;522; 1 0.9497301
## 6873 KIAA1272;715; 1 0.9496068
## 9225 PFKFB2;469; 1 0.9497431
## 9226 PFKFB2;486; 1 0.9739178
## 9930 RANBP3;58; 1 0.9470495
## 11565 TSC2;1466; 1 0.9618744
## 11566 TSC2;939; 1 0.9748210
## 11567 TSC2;981; 1 0.9838358
aktNancies[aktNancies$Identifier %in% dt.mTOR$V1, ]
## Identifier Class predictResult
## 504 AHNAK;5536; -1 0.05942054
## 640 AKT1S1;255; -1 0.26232923
## 761 ANKRD17;2041; -1 0.08440179
## 904 APRF;727; -1 0.07826836
## 3465 DEPDC6;265; -1 0.04626077
## 3468 DEPDC6;293; -1 0.24562685
## 3469 DEPDC6;299; -1 0.04826489
## 3926 EIF4EBP1;36; -1 0.07435220
## 3931 EIF4EBP1;64; -1 0.07719166
## 3935 EIF4EBP2;37; -1 0.17481102
## 3936 EIF4EBP2;46; -1 0.14706185
## 4716 FRAP;2481; -1 0.13432078
## 5123 GRB10;503; -1 0.19877614
## 5780 IRS1;632; -1 0.19456065
## 7640 MAF1;60; -1 0.27295991
## 7641 MAF1;68; -1 0.24959177
## 8839 NUMA1;1982; -1 0.17141194
## 9006 PATL1;184; -1 0.25393371
## 9007 PATL1;194; -1 0.08706322
## 9955 RAPTOR;863; -1 0.10394440
## 9956 RAPTOR;877; -1 0.03634466
## 10344 RPS6KB1;427; -1 0.40373478
## 10346 RPS6KB1;444; -1 0.28905449
## 10348 RPS6KB1;452; -1 0.38390202
## 11110 TBC1D10B;693; -1 0.09100056
## 11684 ULK1;763; -1 0.11154273
# negatives subset for ensemble step
Akt.neg <- dt.Akt.fulmodel[which(dt.Akt.fulmodel$Identifier %in%
aktNancies[aktNancies$predictResult < 0.5, ]$Identifier
& !dt.Akt.fulmodel$Identifier %in% dt.Akt$V1), ]
# mtors
mTORNancies[mTORNancies$Class==1, ]
## Identifier Class predictResult
## 504 AHNAK;5536; 1 0.8978276
## 640 AKT1S1;255; 1 0.9278246
## 761 ANKRD17;2041; 1 0.9244890
## 904 APRF;727; 1 0.9184032
## 3465 DEPDC6;265; 1 0.9057565
## 3468 DEPDC6;293; 1 0.9356970
## 3469 DEPDC6;299; 1 0.7058537
## 3926 EIF4EBP1;36; 1 0.7689698
## 3931 EIF4EBP1;64; 1 0.9312038
## 3935 EIF4EBP2;37; 1 0.9096377
## 3936 EIF4EBP2;46; 1 0.8561459
## 4716 FRAP;2481; 1 0.8938383
## 5123 GRB10;503; 1 0.9752241
## 5780 IRS1;632; 1 0.9229491
## 7640 MAF1;60; 1 0.9485292
## 7641 MAF1;68; 1 0.9643523
## 8839 NUMA1;1982; 1 0.9212074
## 9006 PATL1;184; 1 0.9859447
## 9007 PATL1;194; 1 0.9125086
## 9955 RAPTOR;863; 1 0.9252619
## 9956 RAPTOR;877; 1 0.6953893
## 10344 RPS6KB1;427; 1 0.9214529
## 10346 RPS6KB1;444; 1 0.8957045
## 10348 RPS6KB1;452; 1 0.9215228
## 11110 TBC1D10B;693; 1 0.9433266
## 11684 ULK1;763; 1 0.9377368
mTORNancies[mTORNancies$Identifier %in% dt.Akt$V1, ]
## Identifier Class predictResult
## 646 AKT1S1;318; -1 0.5438617
## 1215 AS160;324; -1 0.5125234
## 1219 AS160;595; -1 0.5464693
## 1226 AS160;649; -1 0.5384547
## 1469 BAD;136; -1 0.4340877
## 1913 BRF1;92; -1 0.4834763
## 2101 CARHSP1;53; -1 0.2614398
## 3760 ECNOS;1176; -1 0.5425490
## 3764 EDC3;161; -1 0.3874161
## 4632 FKHR2;252; -1 0.5385921
## 4705 FOXO1;316; -1 0.4885224
## 5155 GSK3A;21; -1 0.4602780
## 5159 GSK3B;9; -1 0.5051196
## 5751 IRS1;265; -1 0.5074499
## 5774 IRS1;522; -1 0.5585685
## 6873 KIAA1272;715; -1 0.5569316
## 9225 PFKFB2;469; -1 0.5204383
## 9226 PFKFB2;486; -1 0.4887327
## 9930 RANBP3;58; -1 0.4842497
## 11565 TSC2;1466; -1 0.4905927
## 11566 TSC2;939; -1 0.4552222
## 11567 TSC2;981; -1 0.4688654
# negative subset for ensemble step, due to high probability value of labeled Akt in mTOR model, we decided to set up threshold at 0.8 for negative class in mTOR model
mTOR.neg <- dt.mTOR.fulmodel[which(dt.mTOR.fulmodel$Identifier %in%
mTORNancies[mTORNancies$predictResult
< 0.8, ]$Identifier &
!dt.mTOR.fulmodel$Identifier %in% dt.mTOR$V1), ]
Akt.adp2 <- model.adp2.func(dt.Akt.fulmodel, dt.Akt, Akt.neg, 1.1, 1, dt.Akt.fulmodel[, 2:17])
Akt.adp2[Akt.adp2$Class==1, ]
## Identifier Class predictResult
## 646 AKT1S1;318; 1 0.9614144
## 1215 AS160;324; 1 0.9746824
## 1219 AS160;595; 1 0.9567079
## 1226 AS160;649; 1 0.9595746
## 1469 BAD;136; 1 0.9547010
## 1913 BRF1;92; 1 0.9648792
## 2101 CARHSP1;53; 1 0.8479057
## 3760 ECNOS;1176; 1 0.9595276
## 3764 EDC3;161; 1 0.9048182
## 4632 FKHR2;252; 1 0.9776503
## 4705 FOXO1;316; 1 0.9841488
## 5155 GSK3A;21; 1 0.9835935
## 5159 GSK3B;9; 1 0.9776482
## 5751 IRS1;265; 1 0.9805084
## 5774 IRS1;522; 1 0.9567083
## 6873 KIAA1272;715; 1 0.9567106
## 9225 PFKFB2;469; 1 0.9567090
## 9226 PFKFB2;486; 1 0.9794170
## 9930 RANBP3;58; 1 0.9569535
## 11565 TSC2;1466; 1 0.9704809
## 11566 TSC2;939; 1 0.9797748
## 11567 TSC2;981; 1 0.9874541
Akt.adp2[Akt.adp2$Identifier %in% dt.mTOR$V1, ]
## Identifier Class predictResult
## 504 AHNAK;5536; -1 0.06712881
## 640 AKT1S1;255; -1 0.29352988
## 761 ANKRD17;2041; -1 0.09196799
## 904 APRF;727; -1 0.09202448
## 3465 DEPDC6;265; -1 0.04832725
## 3468 DEPDC6;293; -1 0.26932414
## 3469 DEPDC6;299; -1 0.04865975
## 3926 EIF4EBP1;36; -1 0.07723848
## 3931 EIF4EBP1;64; -1 0.08462828
## 3935 EIF4EBP2;37; -1 0.18916819
## 3936 EIF4EBP2;46; -1 0.15615188
## 4716 FRAP;2481; -1 0.14830071
## 5123 GRB10;503; -1 0.22155091
## 5780 IRS1;632; -1 0.22916041
## 7640 MAF1;60; -1 0.31483816
## 7641 MAF1;68; -1 0.28315141
## 8839 NUMA1;1982; -1 0.18857993
## 9006 PATL1;184; -1 0.28201796
## 9007 PATL1;194; -1 0.10565903
## 9955 RAPTOR;863; -1 0.12058494
## 9956 RAPTOR;877; -1 0.03446233
## 10344 RPS6KB1;427; -1 0.43448001
## 10346 RPS6KB1;444; -1 0.32069051
## 10348 RPS6KB1;452; -1 0.42111416
## 11110 TBC1D10B;693; -1 0.10607723
## 11684 ULK1;763; -1 0.12062006
mTOR.adp2 <- model.adp2.func(dt.mTOR.fulmodel, dt.mTOR, mTOR.neg, 1.1, 1, dt.mTOR.fulmodel[, 2:17])
mTOR.adp2[mTOR.adp2$Class==1, ]
## Identifier Class predictResult
## 504 AHNAK;5536; 1 0.9289009
## 640 AKT1S1;255; 1 0.9474065
## 761 ANKRD17;2041; 1 0.9437604
## 904 APRF;727; 1 0.9409647
## 3465 DEPDC6;265; 1 0.9302519
## 3468 DEPDC6;293; 1 0.9547459
## 3469 DEPDC6;299; 1 0.7228846
## 3926 EIF4EBP1;36; 1 0.7955299
## 3931 EIF4EBP1;64; 1 0.9522370
## 3935 EIF4EBP2;37; 1 0.9300593
## 3936 EIF4EBP2;46; 1 0.8801363
## 4716 FRAP;2481; 1 0.9178426
## 5123 GRB10;503; 1 0.9871049
## 5780 IRS1;632; 1 0.9494586
## 7640 MAF1;60; 1 0.9669362
## 7641 MAF1;68; 1 0.9794232
## 8839 NUMA1;1982; 1 0.9418279
## 9006 PATL1;184; 1 0.9931401
## 9007 PATL1;194; 1 0.9397145
## 9955 RAPTOR;863; 1 0.9461211
## 9956 RAPTOR;877; 1 0.7034168
## 10344 RPS6KB1;427; 1 0.9417799
## 10346 RPS6KB1;444; 1 0.9243999
## 10348 RPS6KB1;452; 1 0.9418219
## 11110 TBC1D10B;693; 1 0.9630401
## 11684 ULK1;763; 1 0.9575721
mTOR.adp2[mTOR.adp2$Identifier %in% dt.Akt$V1, ]
## Identifier Class predictResult
## 646 AKT1S1;318; -1 0.5496129
## 1215 AS160;324; -1 0.5129993
## 1219 AS160;595; -1 0.5508560
## 1226 AS160;649; -1 0.5417644
## 1469 BAD;136; -1 0.4259434
## 1913 BRF1;92; -1 0.4892282
## 2101 CARHSP1;53; -1 0.2402407
## 3760 ECNOS;1176; -1 0.5474039
## 3764 EDC3;161; -1 0.3740810
## 4632 FKHR2;252; -1 0.5423527
## 4705 FOXO1;316; -1 0.4887895
## 5155 GSK3A;21; -1 0.4592087
## 5159 GSK3B;9; -1 0.5081475
## 5751 IRS1;265; -1 0.5117899
## 5774 IRS1;522; -1 0.5668681
## 6873 KIAA1272;715; -1 0.5676430
## 9225 PFKFB2;469; -1 0.5250197
## 9226 PFKFB2;486; -1 0.4896158
## 9930 RANBP3;58; -1 0.4866869
## 11565 TSC2;1466; -1 0.4916748
## 11566 TSC2;939; -1 0.4534282
## 11567 TSC2;981; -1 0.4680266
print("Our final Akt prediction result with adaptive sampling technic is saved in data frame Akt.adp2")
## [1] "Our final Akt prediction result with adaptive sampling technic is saved in data frame Akt.adp2"
head(Akt.adp2)
## Identifier Class predictResult
## 1 100043914;88; -1 0.66712713
## 2 100043914;89; -1 0.48211721
## 3 1110018J18RIK;18; -1 0.04828794
## 4 1110037F02RIK;133; -1 0.02528055
## 5 1110037F02RIK;138; -1 0.01501212
## 6 1110037F02RIK;1628; -1 0.01935558
print("Our final mTOR prediction result with adaptive sampling technic is saved in data frame mTOR.adp2")
## [1] "Our final mTOR prediction result with adaptive sampling technic is saved in data frame mTOR.adp2"
head(mTOR.adp2)
## Identifier Class predictResult
## 1 100043914;88; -1 0.49566130
## 2 100043914;89; -1 0.60989320
## 3 1110018J18RIK;18; -1 0.26519363
## 4 1110037F02RIK;133; -1 0.04341526
## 5 1110037F02RIK;138; -1 0.05378188
## 6 1110037F02RIK;1628; -1 0.15526402
diff.func ("predictResult", df.Akt, Akt.adp2, "Akt Adp 2 Model - ")
## [1] "95% Confidence Interval of Average(Bootstrap) difference in predictResult -0.78 -0.18"
diff.func ("predictResult", df.mTOR, mTOR.adp2, "mTOR Adp 2 Model - ")
## [1] "95% Confidence Interval of Average(Bootstrap) difference in predictResult -1.92 -0.87"
# result without adaptive sampling
diff.func ("predictResult", df.Akt, aktNancies, "Akt Full Model - ")
## [1] "95% Confidence Interval of Average(Bootstrap) difference in predictResult -0.33 0.25"
diff.func ("predictResult", df.mTOR, mTORNancies, "mTOR Full Model - ")
## [1] "95% Confidence Interval of Average(Bootstrap) difference in predictResult -2.59 -1.64"
# summarize each feature
feat.list <- dt.Insulin[, c("X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "AktMotif", "mTORMotif", "fitted.score")]
feat.summary <- sapply(feat.list, summary)
feat.sd <- summarise_all(feat.list, funs(sd))
row.names(feat.sd) <- "sd"
feat.summary <- rbind(feat.summary, feat.sd)
#View(feat.summary)
# plot each feature
par(mfrow=c(2, 2))
for (i in 1:ncol(feat.list)) {
x <- feat.list[, i]
h<-hist(x, breaks=30, col="red", xlab=colnames(feat.summary)[i], main=paste0("Histogram with Normal Curve - ", colnames(feat.summary)[i]))
xfit<-seq(min(x),max(x),length=40)
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x))
yfit <- yfit*diff(h$mids[1:2])*length(x)
lines(xfit, yfit, col="blue", lwd=2)
}
print("We can see all features are normally distributed, we can generate simulation data set based on distribution and summary of each feature")
## [1] "We can see all features are normally distributed, we can generate simulation data set based on distribution and summary of each feature"
# generate simulation data
dt.simulation.func <- function(dt){
fullset <- NULL
Identifier <- seq.int(1, 100000, 1)
fullset <- cbind(fullset, Identifier)
for (i in 1:ncol(dt)) {
set.seed(123)
x <- rnorm(n=100000, m=dt[c("Mean"), i], sd=dt[c("sd"), i])
col <- x[x>=dt[c("Min."), i] & x<=dt[c("Max."), i]]
fullset <- cbind(fullset, col)
}
return(fullset)
}
dt.sim <- dt.simulation.func(feat.summary)
## Warning in .Method(..., deparse.level = deparse.level): number of rows of
## result is not a multiple of vector length (arg 2)
## Warning in .Method(..., deparse.level = deparse.level): number of rows of
## result is not a multiple of vector length (arg 2)
## Warning in .Method(..., deparse.level = deparse.level): number of rows of
## result is not a multiple of vector length (arg 2)
## Warning in .Method(..., deparse.level = deparse.level): number of rows of
## result is not a multiple of vector length (arg 2)
## Warning in .Method(..., deparse.level = deparse.level): number of rows of
## result is not a multiple of vector length (arg 2)
dt.sim <- data.frame(dt.sim)
colnames(dt.sim) <- c("Identifier", "X15s", "X30s", "X1m", "X2m", "X5m", "X10m", "X20m", "X60m", "Avg.Fold", "AUC", "Ins.1", "LY", "Ins.2", "MK", "AktMotif", "mTORMotif", "fitted.score")
#class(dt.sim)
#sapply(dt.sim, class)
#View(dt.sim)
#ncol(dt.sim)
#ncol(feat.summary)
# check if any missing value in simulation data set
sum(!complete.cases(dt.sim))
## [1] 0
dt.simulation.Akt <- dt.sim[sample(1:nrow(dt.sim), size = nrow(dt.Insulin)), -which(names(dt.sim) == "mTORMotif")]
dt.simulation.mTOR <- dt.sim[sample(1:nrow(dt.sim), size = nrow(dt.Insulin)), -which(names(dt.sim) == "AktMotif")]
# final prediction for simulated data
Akt.sim.adp2 <- model.adp2.func(dt.Akt.fulmodel, dt.Akt, Akt.neg, 1.1, 1, dt.simulation.Akt )
mTOR.sim.adp2 <- model.adp2.func(dt.mTOR.fulmodel, dt.mTOR, mTOR.neg, 1.1, 1, dt.simulation.mTOR)
# plot given insulin data prob and simulation data prob for Akt
par(mfrow=c(1, 2))
x <- Akt.sim.adp2[, c("predictResult")]
h<-hist(x, breaks=30, col="red", xlab="Predict Result", main="Akt Simulation Data")
xfit<-seq(min(x),max(x),length=40)
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x))
yfit <- yfit*diff(h$mids[1:2])*length(x)
lines(xfit, yfit, col="blue", lwd=2)
x <- Akt.adp2[, c("predictResult")]
h<-hist(x, breaks=30, col="red", xlab="Predict Result", main="Akt Given Data")
xfit<-seq(min(x),max(x),length=40)
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x))
yfit <- yfit*diff(h$mids[1:2])*length(x)
lines(xfit, yfit, col="blue", lwd=2)
print("From the plot, we can see Akt prediction results for both simulation data and actual data are following the similar distribution curve ")
## [1] "From the plot, we can see Akt prediction results for both simulation data and actual data are following the similar distribution curve "
# plot given insulin data prob and simulation data prob for mTOR
par(mfrow=c(1, 2))
x <- mTOR.sim.adp2[, c("predictResult")]
h<-hist(x, breaks=30, col="red", xlab="Predict Result", main="mTOR Simulation Data")
xfit<-seq(min(x),max(x),length=40)
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x))
yfit <- yfit*diff(h$mids[1:2])*length(x)
lines(xfit, yfit, col="blue", lwd=2)
x <- mTOR.adp2[, c("predictResult")]
h<-hist(x, breaks=30, col="red", xlab="Predict Result", main="mTOR Given Data")
xfit<-seq(min(x),max(x),length=40)
yfit<-dnorm(xfit,mean=mean(x),sd=sd(x))
yfit <- yfit*diff(h$mids[1:2])*length(x)
lines(xfit, yfit, col="blue", lwd=2)
print("From the plot, we can see mTOR prediction results for both simulation data and actual data are following the similar distribution curve ")
## [1] "From the plot, we can see mTOR prediction results for both simulation data and actual data are following the similar distribution curve "
sessionInfo()
## R version 3.4.1 (2017-06-30)
## Platform: x86_64-apple-darwin16.6.0 (64-bit)
## Running under: macOS Sierra 10.12.6
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libLAPACK.dylib
##
## locale:
## [1] en_AU.UTF-8/en_AU.UTF-8/en_AU.UTF-8/C/en_AU.UTF-8/en_AU.UTF-8
##
## attached base packages:
## [1] grid stats4 parallel splines stats graphics grDevices
## [8] utils datasets methods base
##
## other attached packages:
## [1] MASS_7.3-47 kernlab_0.9-25 pROC_1.10.0
## [4] seqLogo_1.42.0 RColorBrewer_1.1-2 bindrcpp_0.2
## [7] kknn_1.3.1 Biostrings_2.44.2 XVector_0.16.0
## [10] IRanges_2.10.5 S4Vectors_0.14.7 BiocGenerics_0.22.1
## [13] mlbench_2.1-1 tidyr_0.7.1 stringr_1.2.0
## [16] scales_0.4.1 gbm_2.1.3 survival_2.41-3
## [19] e1071_1.6-8 class_7.3-14 dplyr_0.7.2
## [22] plyr_1.8.4 caret_6.0-77 ggplot2_2.2.1
## [25] lattice_0.20-35 mclust_5.3
##
## loaded via a namespace (and not attached):
## [1] ddalpha_1.3.1 sfsmisc_1.1-1 foreach_1.4.3
## [4] prodlim_1.6.1 assertthat_0.2.0 DRR_0.0.2
## [7] yaml_2.1.14 robustbase_0.92-7 ipred_0.9-6
## [10] backports_1.1.0 glue_1.1.1 digest_0.6.12
## [13] colorspace_1.3-2 recipes_0.1.0 htmltools_0.3.6
## [16] Matrix_1.2-10 timeDate_3012.100 pkgconfig_2.0.1
## [19] CVST_0.2-1 zlibbioc_1.22.0 purrr_0.2.4
## [22] gower_0.1.2 lava_1.5.1 tibble_1.3.4
## [25] withr_2.0.0 nnet_7.3-12 lazyeval_0.2.0
## [28] magrittr_1.5 evaluate_0.10.1 nlme_3.1-131
## [31] dimRed_0.1.0 tools_3.4.1 munsell_0.4.3
## [34] compiler_3.4.1 RcppRoll_0.2.2 rlang_0.1.1
## [37] iterators_1.0.8 igraph_1.1.2 labeling_0.3
## [40] rmarkdown_1.6 gtable_0.2.0 ModelMetrics_1.1.0
## [43] codetools_0.2-15 reshape2_1.4.2 R6_2.2.2
## [46] lubridate_1.6.0 knitr_1.17 bindr_0.1
## [49] rprojroot_1.2 stringi_1.1.5 Rcpp_0.12.12
## [52] rpart_4.1-11 DEoptimR_1.0-8